### Configuration
################################################################################
### R Options
### R Options
# Query Slurm for the configured resources (CfgTRES) of the current node; the
# values size the parallel workers and the future globals limit below.
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
# NOTE(review): despite the name, this value is in bytes ("mem=" is reported in
# megabytes, then multiplied by 1024^2) - consistent with future.globals.maxSize,
# which expects bytes
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2

options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        future.globals.maxSize=min(memInKB, 2 * 1024^3),  # cap globals at 2 GiB
        # Fixed: was min(cpus, 1), which always resolved to 1 core and defeated
        # the Slurm CPU lookup above; max() only guards against cpus < 1
        mc.cores=max(cpus, 1),
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

### Functions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R", 
                        "functions_biomart.R", 
                        "functions_degs.R", 
                        "functions_io.R", 
                        "functions_plotting.R", 
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)

# Verify that all helper files exist before sourcing any of them
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) {
  # Fixed message formatting: the previous paste() produced stray spaces around
  # the quoted path and a trailing ".!"
  stop("The following files could not be found: ",
       paste(git_files_to_source[!file_exists], collapse=", "),
       ". Please check the git directory at '", param$path_to_git, "'.")
}
invisible(purrr::map(git_files_to_source, source))

### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions 
param$debugging_mode = "default_debugging"

# Install the matching error handler; anything unrecognised falls back to the default
if (identical(param$debugging_mode, "terminal_debugger")) {
  on_error_start_terminal_debugger()
} else if (identical(param$debugging_mode, "print_traceback")) {
  on_error_just_print_traceback()
} else {
  on_error_default_debugging()
}
### Rmarkdown configuration
################################################################################

### Create output directories
# Use dir.exists() (not file.exists()) so a regular file that happens to share
# the name is not mistaken for the output directory
if (!dir.exists(file.path(param$path_out, "figures"))) dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
if (!dir.exists(file.path(param$path_out, "data"))) dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)


### R Options
# Rendering-related options: pandoc-style citations, HTML tables
options(citation_format="pandoc", 
        knitr.table.format="html",
        kableExtra_view_html=TRUE)


### Knitr default options
# Chunk defaults are collected in a named list first, then applied in one call
knitr_chunk_defaults = list(
  echo = TRUE,                          # output code
  cache = FALSE,                        # do not cache results
  message = TRUE,                       # show messages
  warning = TRUE,                       # show warnings
  tidy = FALSE,                         # do not auto-tidy-up code
  fig.width = 10,                       # default fig width in inches
  class.source = 'fold-hide',           # by default collapse code blocks
  # Create figures in png, svg and tiff; the first device (png) is used for HTML output
  dev = c('png', 'svg', 'tiff'),
  # Use cairo for png and tiff - works on cluster; tiff compression "zip" is 'deflate (Adobe-style)'
  dev.args = list(png = list(type="cairo"),
                  tiff = list(type="cairo", compression = 'zip')),
  dpi = 300,                            # figure resolution
  fig.retina = 2,                       # retina multiplier
  # Path for figures (trailing "/" is needed)
  fig.path = paste0(file.path(param$path_out, "figures"), "/")
)
do.call(knitr::opts_chunk$set, knitr_chunk_defaults)

### Set output hooks
# Route knitr messages and warnings through the custom formatters sourced above
# (format_message/format_warning come from the helper files)
knitr::knit_hooks$set(message=format_message, warning=format_warning)

### Required libraries
library(magrittr)

### Set the bibliographic environment
# Clear previous bibliography so citations collected in this run start fresh
knitcitations::cleanbib()

# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
rcug_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","rcug_references.bib"))
invisible(knitcitations::citep(rcug_ref))

### Figure heights
# Default plot heights (inches) used by later chunks; the patchwork heights are
# derived from the per-row heights times the number of rows
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4
# Load renv and virtualenvs
# renv::load() must run before the library() calls below so packages resolve
# from the project library rather than the user/system library
renv::load(file.path(param$path_to_git,"env/basic"))
renv::use_python(type = "virtualenv", name = file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))
#reticulate::use_virtualenv(file.path(param$path_to_git,"env/basic/virtualenvs/r-reticulate"))

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(knitr)

Read data

Read gene annotation

Gene annotation, including Ensembl IDs, gene symbols, Entrez IDs, and Seurat gene names, is loaded from a pre-prepared reference file or Ensembl.

## Read gene annotation
# We read gene annotation from file. 
# We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat gene names. 

### Set reference
################################################################################
# Known species get their Ensembl BioMart dataset and the regex that matches
# mitochondrial gene names; for any other species, param$mart_dataset must
# already be set by the caller (the previous code only re-assigned it to itself)
if (param$species == "human") {
  param$mart_dataset = "hsapiens_gene_ensembl"
  param$mt = "^MT-"
} else if (param$species == "mouse") {
  param$mart_dataset = "mmusculus_gene_ensembl"
  param$mt = "^mt-"
}

# Default Ensembl annotation version if none was provided
if (is.null(param$annot_version)) {
  param$annot_version = 98
}

# Derived reference paths: <git>/references/<dataset>/<version>/<dataset>.v<version>.annot.txt
param$path_reference = file.path(param$path_to_git, "references", param$mart_dataset, param$annot_version)
param$reference = paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt")
param$file_annot = file.path(param$path_reference, param$reference)
param$file_cc_genes = file.path(param$path_reference, "cell_cycle_markers.xlsx")


### Download reference if not existing
################################################################################
# If either the annotation table or the cell cycle marker file is missing, run
# the download module to (re-)create them. Scalar condition, so use the
# short-circuiting || instead of the elementwise |.
if (!file.exists(param$file_annot) || !file.exists(param$file_cc_genes)) {
  source(file.path(param$path_to_git, "modules/download_references/download_references.R"))
}


### read_ensembl_annotation
################################################################################

# Load the pre-prepared annotation table from file
annot_ensembl = read.delim(param$file_annot)

# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Translate IDs
# TranslateIDs (helper function) returns a list; the elements are unpacked into
# the lookup tables used throughout the workflow plus the (possibly modified)
# annotation table
IDs_out = suppressWarnings(TranslateIDs(annot_ensembl, param$annot_main)) 
ensembl_to_seurat_rowname = IDs_out[[1]]
seurat_rowname_to_ensembl = IDs_out[[2]]
seurat_rowname_to_entrez = IDs_out[[3]]
annot_ensembl = IDs_out[[4]]


### read_cc_genes
################################################################################
# Use biomart to translate human cell cycle genes to the species of interest

# Read one sheet of the cell cycle marker workbook and convert the species
# Ensembl IDs to Seurat-compatible unique rownames
ReadCcSheet = function(sheet_idx) {
  markers = openxlsx::read.xlsx(param$file_cc_genes, sheet=sheet_idx)
  data.frame(Human_gene_name=markers$Human_gene_name,
             Species_gene_name=unname(ensembl_to_seurat_rowname[markers$Species_ensembl_id]))
}

# Sheet 1 holds the S-phase markers, sheet 2 the G2/M markers
genes_s = ReadCcSheet(1)
genes_g2m = ReadCcSheet(2)

Read scRNA-seq data

The workflow is applicable to single cell and nuclei RNAseq data pre-processed via 10x Genomics or SmartSeq-2, or to other data that are represented by a simple table with transcript counts per gene and cell. In the first step, a Seurat object of the data is generated and subsequently some metadata are added. Similarly, a Seurat object can be loaded to inspect the stored scRNA-seq data.

Pre-processing of 10x data with Cell Ranger We use the 10x Genomics Cell Ranger software to map 10x sequencing reads. The result is a count matrix that contains the number of unique transcripts per gene (rows) and cell (columns). To save storage space, the matrix is converted into a condensed format and described by the following 3 files:
  • “features.tsv.gz”: Tabular file with gene annotation (Ensembl gene ID and the gene symbol) to identify genes
  • “barcodes.tsv.gz”: File with cell barcodes to identify cells
  • “matrix.mtx.gz”: A condensed version of the count matrix
Pre-processing of SmartSeq-2 data Sequencing reads from SmartSeq-2 (and other) experiments can be mapped with any suitable mapping software as long as a count matrix is generated that contains the number of mapped reads per gene (rows) and cell (columns). The first row of the count matrix should contain cell barcodes or names, the first column of the matrix should contain Ensembl gene IDs.
What is Seurat and which information is contained in a Seurat object? Seurat is an R-package that is used for the analysis of single-cell RNA-seq data. We read pre-processed data as described above, and convert it to a count matrix in R. In the case of 10x data, the count matrix contains the number of unique RNA molecules (UMIs) per gene and cell. In the case of SmartSeq-2 data, the count matrix contains the number of reads per gene and cell. Seurat v5 assays store data in layers. These layers can store raw, un-normalized counts (layer=‘counts’), normalized data (layer=‘data’), or z-scored/variance-stabilized (scaled) data (layer=‘scale.data’).
In addition, the workflow stores additional information in the Seurat object, including but not limited to dimensionality reduction and cluster results.

Here, for the project Testdata, the following data are analysed:

### Read rds object
################################################################################

### Load Seurat S4 objects 
# Test if file is defined
if (is.null(param$data)) {
  message("Dataset is not specified")
} else {
  # Test if file exists
  if (file.exists(param$data)) {
    # Read object
    message(paste0("Load dataset: ", param$data))
    sc = base::readRDS(param$data)
    
    # Transfer original params to loaded object
    # Parameters stored inside the object take precedence, except for the
    # run-specific ones listed in param_keep and the advanced settings
    # (param_advset), which win last
    if ("parameters" %in% list_names(sc@misc[])) {
      orig_param = sc@misc$parameters
      
      param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
      param = modifyList(x = param, val = orig_param)
      param = modifyList(x = param, val = param_keep)
      param = modifyList(x = param, val = param_advset)
    }
    
    ### Set colors
    # Set sample colors based on orig.ident; regenerate if the provided palette
    # does not match the number of samples
    if (!is.null(sc@meta.data[["orig.ident"]])) {
      if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples)) 
        sc = sample_colours_out[[1]]
        param$col_samples = sample_colours_out[[2]]
      }
    }
 
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(sc@meta.data[["seurat_clusters"]])) {
      if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters)) 
        sc = cluster_colours_out[[1]]
        param$col_clusters = cluster_colours_out[[2]]
      }
    }

    # Set colors for cell types based on annotation 
    if (!is.null(sc@meta.data[["annotation"]])) {
      if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation)) 
        sc = annotation_colours_out[[1]]
        param$col_annotation = annotation_colours_out[[2]]
      }
    }

  # Print the loaded object (knitr shows the Seurat summary)
  sc
  } else {
  message("Dataset does not exist")
  }
}


### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
  # Test if file exists
  if (file.exists(param$refdata)) {
    # Read object
    message(paste0("Load dataset: ", param$refdata)) 
    scR = base::readRDS(param$refdata)
    
    # Transfer original params to loaded object
    # Only the colour settings of the reference are copied (under *_ref names);
    # advanced settings (param_advset) win last
    if ("parameters" %in% list_names(scR@misc[])) {
      orig_paramR = scR@misc$parameters
      
      if (!is.null(orig_paramR$col_samples)) {
        param$col_samples_ref = orig_paramR$col_samples
      }
      if (!is.null(orig_paramR$col_clusters)) {
        param$col_clusters_ref = orig_paramR$col_clusters
      }
      if (!is.null(orig_paramR$col_annotation)) {
        param$col_annotation_ref = orig_paramR$col_annotation
      }
      param = modifyList(x = param, val = param_advset)
    }
    
    ### Set colors
    # Set sample colors based on orig.ident; regenerate if the provided palette
    # does not match the number of samples
    if (!is.null(scR@meta.data[["orig.ident"]])) {
      if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
        message(paste0("No or wrong number of distinct colors for samples provided. Generate colors using color palette ", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples)) 
        scR = sample_colours_out[[1]]
        param$col_samples_ref = sample_colours_out[[2]]
      }
    }
    
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(scR@meta.data[["seurat_clusters"]])) {
      if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
        message(paste0("No or wrong number of distinct colors for clusters provided. Generate colors using color palette ", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters)) 
        scR = cluster_colours_out[[1]]
        param$col_clusters_ref = cluster_colours_out[[2]]
      }
    }
    
    # Set colors for cell types based on annotation 
    if (!is.null(scR@meta.data[["annotation"]])) {
      if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
        message(paste0("No or wrong number of distinct colors for cell type annotation provided. Generate colors using color palette ", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation)) 
        scR = annotation_colours_out[[1]]
        param$col_annotation_ref = annotation_colours_out[[2]]
      }
    }
    
  # Print the loaded reference object (knitr shows the Seurat summary)
  scR
  } else {
  message("Reference dataset does not exist")
  }
}  
# Transfer object into a list
# The single loaded Seurat object is wrapped in a one-element list keyed by the
# project id, so that downstream code can treat single and multiple datasets alike
sc_original = sc
n = param$project_id
sc = setNames(list(sc_original), n)
# Download test dataset 
# Path to the download module: <git>/modules/download_test_datasets/<name>.R
param$path_test_dataset = file.path(param$path_to_git, "modules", "download_test_datasets", paste0(param$download_test_datasets, ".R"))
if (file.exists(param$path_test_dataset)) {
  message(paste0("Using test dataset '", gsub('download_','', param$download_test_datasets), "'."))
  # Data output in a data subfolder of the directory where it is run 
  # Create output directories
  # NOTE(review): this creates "data" relative to the CURRENT working directory,
  # but the subsequent setwd() switches into <git>/data - confirm which location
  # the download script is expected to write to
  if (!file.exists("data")) dir.create("data", recursive=TRUE, showWarnings=FALSE)
  # NOTE(review): setwd() without restoring the previous directory afterwards -
  # later chunks run from <git>/data; verify this is intentional
  setwd(file.path(param$path_to_git,"data"))
  source(param$path_test_dataset)
  # Register each downloaded 10x count folder as one dataset entry
  param$path_data = data.frame(name=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
                                              full.names = FALSE, recursive = FALSE), 
                                type=c("10x"),
                                path=list.dirs(path = file.path(param$path_to_git,"data", "counts"),
                                              full.names = TRUE, recursive = FALSE))
} else {
  message("Test dataset does not exist.")
}
# List of Seurat objects
sc = list()

# Read every dataset listed in param$path_data into a Seurat object.
# seq_len() instead of seq(): seq(nrow(...)) would yield c(1, 0) and crash the
# loop body if the datasets table were empty.
datasets = param$path_data
for (i in seq_len(nrow(datasets))) {
  name = datasets[i, "name"]
  type = datasets[i, "type"]
  path = datasets[i, "path"]
  suffix = datasets[i, "suffix"]
  
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
    
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object (one object per sample/plate)
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  } 
}

# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
# Flag every occurrence of a duplicated name (first occurrences included)
is_duplicated = duplicated(sample_names) | duplicated(sample_names, fromLast=TRUE)
for (i in which(is_duplicated)) {
  dataset_name = head(sc[[i]][["orig.dataset", drop=TRUE]], 1)
  sample_names[i] = paste(dataset_name, sample_names[i], sep=".")
  sc[[i]][["orig.ident"]] = sample_names[i]
}

# Set up colors for samples and add them to the sc objects
# Collect the distinct orig.ident values of every object into one character vector
sample_names = purrr::map(sc, function(s) unique(as.character(s[[]][["orig.ident"]]))) %>%
  purrr::flatten_chr()
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

message("Read scRNA-seq data into Seurat object")
sc
## $sample1
## An object of class Seurat 
## 8000 features across 500 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts
## 
## $sample2
## An object of class Seurat 
## 8000 features across 700 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts
### Calculate percentages of mitochondrial and ribosomal genes as well as ERCC (qc_calculate_cells)
# Calculate percentage of counts in mitochondrial genes for each Seurat object
# (skipped if percent_mt is already present in the metadata)
sc = purrr::map(sc, function(s) {
  if ("percent_mt" %in% colnames(s@meta.data)) return(s)
  mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
  Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay=param$assay_raw)
})

# Calculate percentage of counts in ribosomal genes for each Seurat object
# (gene symbols starting with RPS/RPL, case-insensitive)
sc = purrr::map(sc, function(s) {
  if ("percent_ribo" %in% colnames(s@meta.data)) return(s)
  ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
  Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay=param$assay_raw)
})

# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
  if ("ERCC" %in% Seurat::Assays(s)) {
    s$percent_ercc = s$nCount_ERCC / (s$nCount_ERCC + s$nCount_RNA) * 100
  }
  return(s)
})

# Combine cell metadata of the Seurat objects into one big metadata table
sc_cell_metadata = suppressWarnings(
  purrr::map_dfr(sc, function(s) s[[]]) %>% as.data.frame()
)

# Names of all available QC metrics; percent_ercc only if any object had an ERCC assay
cell_qc_features = c(paste0(c("nFeature_", "nCount_"), param$assay_raw), "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) {
  cell_qc_features = c(cell_qc_features, "percent_ercc")
}
cell_qc_features = values_to_names(cell_qc_features)




### Expand filter thresholds to all samples (qc_criteria_create)
# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$cell_filter)) param$cell_filter[[s]] else param$cell_filter
})

param$feature_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$feature_filter)) param$feature_filter[[s]] else param$feature_filter
})

# List of all filter thresholds per QC metric: for each metric, a long-format
# data frame with one row per sample and threshold type (min/max), NA thresholds dropped
cell_qc_thresholds = purrr::map(cell_qc_features, function(m) {
  purrr::map_dfr(names(param$cell_filter), function(n) {
    # Samples without a filter for this metric contribute no rows
    if (!(m %in% names(param$cell_filter[[n]]))) return(NULL)
    thresholds = data.frame(orig.ident=n, min=param$cell_filter[[n]][[m]][1], max=param$cell_filter[[n]][[m]][2]) %>% 
      tidyr::pivot_longer(cols=2:3, names_to=c("threshold")) %>%
      dplyr::filter(!is.na(value))
    thresholds$threshold = factor(thresholds$threshold, levels=c("min", "max"))
    thresholds
  })
})



### Calculate diverse feature characteristics (qc_calculate_features)
# Only RNA assay at the moment
# counts_median uses apply on the counts matrix, which converts the sparse matrix into a normal matrix
#   This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
  # Calculate percentage of counts per gene in a cell
  # (each column of the counts matrix divided by that cell's total counts, times 100)
  counts_rna = Seurat::GetAssayData(sc[[n]], layer="counts", assay=param$assay_raw)
  total_counts = sc[[n]][[paste0("nCount_", param$assay_raw), drop=TRUE]]
  counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100
  
  # Calculate feature filters
  # num_cells_expr: number of cells with at least one count of the gene
  # num_cells_expr_threshold: number of cells reaching the per-sample min_counts filter
  num_cells_expr = Matrix::rowSums(counts_rna >= 1)
  num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
  
  # Calculate median of counts_rna_perc per gene 
  # NOTE(review): apply() densifies the sparse matrix - memory-heavy for large datasets
  counts_median = apply(counts_rna_perc, 1, median)
  
  # Add all QC measures as feature-level metadata on the raw assay
  sc[[n]][[param$assay_raw]] = Seurat::AddMetaData(sc[[n]][[param$assay_raw]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
  return(sc[[n]])
})

The following first table shows available metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), the number of mapped reads (“nCount_RNA”), the number of unique genes detected (“nFeature_RNA”), or percentage of mitochondrial genes (“percent_mt”).
The second table shows available metadata (columns) of the first 5 genes (rows).

# Print cell metadata
# head(x, 5) to match the caption ("top 5 rows"); a bare head() would print 6
# rows, and the sibling feature-metadata table already uses head(, 5)
knitr::kable(head(sc_cell_metadata, 5), align="l", caption="Cell metadata, top 5 rows") %>%
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Cell metadata, top 5 rows
orig.ident nCount_RNA nFeature_RNA orig.dataset percent_mt percent_ribo
AACGAAACAGCGTTTA-1 sample1 1901 682 sample1 14.571278 11.888480
GCTTCACCAACACGAG-1 sample1 9196 2070 sample1 8.231840 19.214876
TAGTGCACAGAGGAAA-1 sample1 7520 1822 sample1 7.140957 17.433511
AGAGCAGTCGCGCTGA-1 sample1 2611 902 sample1 13.481425 32.669475
TGTTCATGTCCTGGGT-1 sample1 2658 937 sample1 14.522197 27.990971
CTTGAGATCCCGAGGT-1 sample1 287 41 sample1 86.411150 4.529617
# Print gene metadata (feature-level metadata of the raw assay, first dataset only)
feature_metadata_head = head(sc[[1]][[param$assay_raw]][[]], 5)
knitr::kable(feature_metadata_head, align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Feature metadata, top 5 rows (only first dataset shown)
feature_id feature_name feature_type num_cells_expr num_cells_expr_threshold counts_median
AL627309.1 ENSG00000238009 AL627309.1 Gene Expression 1 1 0
LINC01409 ENSG00000237491 AL669831.5 Gene Expression 32 32 0
FAM41C ENSG00000230368 FAM41C Gene Expression 17 17 0
KLHL17 ENSG00000187961 KLHL17 Gene Expression 12 12 0
PLEKHN1 ENSG00000187583 PLEKHN1 Gene Expression 1 1 0




Quality control

Quality control (QC) is an important step of the pre-processing workflow. Here we assess the cell quality, i.e. the cell viability and duplicate rate to determine filter parameter.

Why is pre-processing so important? Cells may have been damaged during sample preparation and might be only partially captured in the sequencing. In addition, free-floating mRNA from damaged cells can be encapsulated, adding to the background noise. The low-quality cells and free-floating mRNA interfere with downstream analyses. Therefore, cells and genes are filtered based on defined quality metrics. Data normalization eliminates cell-specific biases such as the absolute number of reads per cell, allowing us to systematically compare cells afterwards. Subsequent scaling corrects for the fact that genes have different lengths, allowing us to compare genes afterwards. Lastly, highly variable genes are determined, reducing computational overhead and noise from genes that are not interesting.

Determining filter thresholds

These commonly used QC metrics, namely the number of unique genes detected in each cell (“nFeature_”), the total number of molecules detected in each cell (“nCount_”), and the percentage of counts that map to the mitochondrial genome (“percent_mt”), can be used to infer filter thresholds. If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).

Which cells can be considered to be of low quality?

In cell QC these covariates are filtered via thresholding as they are a good indicator of cell viability. However, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals.

  • Cells with very high number of genes should possibly be excluded as those might be duplicates. Since the total number of reads detected within a cell typically strongly correlates with the number of unique genes, we can focus on the number of unique genes for filtering.
    • Cells with very low count depth might be droplets with ambient RNA. If the dataset contains a high number of such events, it is advisable to additionally perform upstream correction for ambient RNA, e.g. via SoupX r knitcitations::citet("https://doi.org/10.1093/gigascience/giaa151"), as ambient RNA can confound the number of observed counts and adds background noise to the data.
    • Low-quality cells such as dying, degraded and damaged cells will also often have very few genes (“nFeature_”) and low total counts (“nCount_”). In addition, damaged cells often exhibit high mitochondrial (“percent_mt”) or spike-in (“percent_ercc”) content. As mitochondria have their own membranes, their RNA is often the last to degrade in damaged cells and can thus be found in high numbers. However, it is important to keep in mind that different cell types and cells isolated from various species may differ in their number of expressed genes and metabolism. For example, stem cells may express a higher number of unique genes, and metabolically active cells may express a higher number of mitochondrial genes. Hence, it is crucial to consider the three QC covariates jointly as otherwise it might lead to misinterpretation of cellular signals. E.g. a cell with a high fraction of mitochondrial counts in combination with a low number of counts might be a dying cell with a broken membrane, but in combination with an intermediate to high count depth it might be a cell involved in respiratory processes.
Impact of low-quality cells on downstream analyses First of all, low-quality cells of different cell types might cluster together due to similarities in their damage-induced gene expression profiles. This leads to artificial intermediate states or trajectories between subpopulations that would otherwise be distinct. Second, low-quality cells might mask relevant gene expression changes. The main differences captured by the first principal components will likely be based on cell quality rather than the underlying biology, reducing the power of dimensionality reduction. Likewise, highly variable genes might just be genes that differ between low- and high-quality cells. Lastly and equally important, the low number of total counts in low-quality cells might lead to artificial upregulation of genes due to wrong scaling effects during the normalisation process.
# Create plot per QC metric
# One violin plot per QC metric, split by sample, collected in p_list and
# combined with patchwork below.
# NOTE(review): aes(.data[[m]]) captures the loop variable `m` lazily and the
#   plots are only rendered after the loop; verify each panel shows its own
#   metric (wrapping the construction in local() or a function would make the
#   capture per-iteration).
p_list = list()
for (m in cell_qc_features) {
  p_list[[m]]= ggplot(sc_cell_metadata[, c("orig.ident", m)], aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]], group=.data[["orig.ident"]])) +
    geom_violin(scale="width")

  # Adds points for samples with less than three cells since geom_violin does not work here
  p_list[[m]] = p_list[[m]] + 
    geom_point(data=sc_cell_metadata[, c("orig.ident", m)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes(x=.data[["orig.ident"]], y=.data[[m]], fill=.data[["orig.ident"]]), shape=21, size=2)
  
  # Now add style
  p_list[[m]] = p_list[[m]] + 
    AddStyle(title=m, legend_position="none", fill=param$col_samples, xlab="") + 
    theme(axis.text.x=element_text(angle=45, hjust=1))
  
  # Add filter threshold as segments to plot; min threshold lines are dashed and max threshold lines are twodashed
  if (nrow(cell_qc_thresholds[[m]]) > 0) {
    p_list[[m]] = p_list[[m]] + geom_segment(data=cell_qc_thresholds[[m]], 
                                             aes(x=as.integer(as.factor(orig.ident))-0.5, 
                                                 xend=as.integer(as.factor(orig.ident))+0.5, 
                                                 y=value, yend=value, lty=threshold), colour="firebrick") +
      scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min")))
  }
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values") 
p

# Correlate QC metrics for cells
p_list = list()
sc_cell_metadata_plot_order = sample(1:nrow(sc_cell_metadata))

# nFeature vs nCount
m = paste0(c("nCount_", "nFeature_"), param$assay_raw)
p_list[[1]] = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE], aes(x=.data[[m[1]]], y=.data[[m[2]]], colour=.data[["orig.ident"]])) +
  geom_point(size = param$pt_size) + 
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  AddStyle(col=param$col_samples)
if (nrow(cell_qc_thresholds[[m[1]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_vline(data=cell_qc_thresholds[[m[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
    
}
if (nrow(cell_qc_thresholds[[m[2]]]) > 0) {
  p_list[[1]] = p_list[[1]] + geom_hline(data=cell_qc_thresholds[[m[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
  

# nFeature vs percent_mt, coloured by sample of origin
mt_pair = c("percent_mt", paste0("nFeature_", param$assay_raw))
mt_scatter = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE],
                    aes(x=.data[[mt_pair[1]]], y=.data[[mt_pair[2]]], colour=.data[["orig.ident"]])) +
  geom_point(size = param$pt_size) +
  scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
  AddStyle(col=param$col_samples)
# Overlay filtering thresholds where defined
if (nrow(cell_qc_thresholds[[mt_pair[1]]]) > 0) {
  mt_scatter = mt_scatter + geom_vline(data=cell_qc_thresholds[[mt_pair[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
}
if (nrow(cell_qc_thresholds[[mt_pair[2]]]) > 0) {
  mt_scatter = mt_scatter + geom_hline(data=cell_qc_thresholds[[mt_pair[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
}
p_list[[2]] = mt_scatter

# nFeature vs percent_ercc — only when ERCC spike-ins were quantified
if ("percent_ercc" %in% names(cell_qc_features)) {
  ercc_pair = c("percent_ercc", paste0("nFeature_", param$assay_raw))
  ercc_scatter = ggplot(sc_cell_metadata[sc_cell_metadata_plot_order, , drop=FALSE],
                        aes(x=.data[[ercc_pair[1]]], y=.data[[ercc_pair[2]]], colour=.data[["orig.ident"]])) +
    geom_point(size = param$pt_size) +
    scale_linetype_manual(values=setNames(c("dashed", "F1"), c("max", "min"))) +
    AddStyle(col=param$col_samples)
  # Overlay filtering thresholds where defined
  if (nrow(cell_qc_thresholds[[ercc_pair[1]]]) > 0) {
    ercc_scatter = ercc_scatter + geom_vline(data=cell_qc_thresholds[[ercc_pair[1]]], aes(xintercept=value, lty=threshold), colour="firebrick")
  }
  if (nrow(cell_qc_thresholds[[ercc_pair[2]]]) > 0) {
    ercc_scatter = ercc_scatter + geom_hline(data=cell_qc_thresholds[[ercc_pair[2]]], aes(yintercept=value, lty=threshold), colour="firebrick")
  }
  p_list[[3]] = ercc_scatter
}

# Combine the scatter panels side by side under one title
n_panels = length(p_list)
p = patchwork::wrap_plots(p_list, ncol=n_panels) + patchwork::plot_annotation("QC covariates scatter plots")
# With several panels, collect the duplicated sample legends into one
if (n_panels > 1) {
  p = p + patchwork::plot_layout(guides="collect")
}
p = p & theme(legend.position="bottom")
p


To save time and computational resources, we down-sample the number of cells before continuing with further quality assessment. Depending on the number of samples provided, the data are down-sampled to 1000 cells (1 sample), 500 cells (2-3 samples), or 300 cells (4 or more samples).

# Choose the down-sampling target from the number of samples:
# 1000 cells (1 sample), 500 (2-3 samples), or 300 (4 and more samples)

n_samples = length(sc)
if (n_samples >= 4) {
  param$downsample_cells_n = 300
} else if (n_samples > 1) {
  param$downsample_cells_n = 500
} else {
  param$downsample_cells_n = 1000
}

# downsample_cells_n overwrites downsample_cells_equally
if (!is.null(param$downsample_cells_n)) {
  n = param$downsample_cells_n
} else if (isTRUE(param$downsample_cells_equally)) {
  # Down-sample every sample to the size of the smallest one
  n = purrr::map_int(sc, ncol) %>% min()
}

# Actual downsampling
# '||' (not '|') short-circuits: with '|', an unset (NULL) downsample_cells_equally
# yields 'TRUE | logical(0)' == logical(0) and if() errors; isTRUE() makes the
# second condition robust to NULL as well
if (!is.null(param$downsample_cells_n) || isTRUE(param$downsample_cells_equally)) {
  sc = purrr::map(sc, function(s) {
    # Deterministic cell sampling (fixed seed for reproducibility)
    cells = ScSampleCells(sc=s, n=n, seed=1)
    return(subset(s, cells=cells))
  })
  
  # Adjust combined metadata accordingly
  sc_cell_metadata = sc_cell_metadata[unlist(purrr::map(sc, Cells)), ]
  
  # Report 'n' (set in either branch above), not param$downsample_cells_n,
  # which is NULL when only downsample_cells_equally was requested
  message(paste0("Your data has been down-sampled to ", n, " cells." ))
  print(sc)
}

× (Message)
Your data has been down-sampled to 500 cells.

## $sample1
## An object of class Seurat 
## 8000 features across 500 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts
## 
## $sample2
## An object of class Seurat 
## 8000 features across 500 samples within 1 assay 
## Active assay: RNA (8000 features, 0 variable features)
##  1 layer present: counts


Genes with highest expression

We next plot the genes with the highest median percentage of counts, i.e. for each cell, we calculate the percentage of counts per gene and then calculate the median value of these percentages for each gene in all cells.
Often those genes are mitochondrial or ribosomal genes. Here, the percentage of raw counts per gene in a cell and the similarity between samples are of importance. A high percentage of raw counts from those presumably less interesting genes suggests performing scaling of the data. Differences in the percentage of raw counts of those genes between supposedly similar samples might be an indication of a batch effect.

# Plot only samples that we intend to keep
keep = !(names(sc) %in% param$samples_to_drop)
sc_names = names(sc)[keep]

# Union of the top-10 genes (by median % of counts per cell) over all kept samples
genes_highestExpr = purrr::map(sc_names, function(sample_name) {
  medians = sc[[sample_name]][[param$assay_raw]][["counts_median"]]
  medians = medians %>% dplyr::arrange(dplyr::desc(counts_median))
  rownames(head(medians, n=10))
}) %>%
  unlist() %>%
  unique()

# One column of median percentages per sample, then reshape to long format
genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[[param$assay_raw]][["counts_median"]][genes_highestExpr, ])
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = tidyr::pivot_longer(genes_highestExpr_counts, cols=all_of(sc_names))
# Fix the sample order on the x-axis
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)

# One colour per gene from the Simpsons palette
col = GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) +
  geom_point() +
  AddStyle(title="Top 10 highest expressed genes per sample, added into one list",
           xlab="Sample", ylab="Median % of raw counts\n per gene in a cell",
           legend_position="bottom",
           col=col)
# Connect per-gene values across samples only when there is more than one sample
n_shown = length(unique(genes_highestExpr_counts$name))
if (n_shown > 1) {
  p = p + geom_line()
}
p




Normalization and scaling

Depending on the normalisation of your choice, we either:

  1. Perform standard log normalisation by running standard functions to select variable genes, and scale normalised gene counts.
  2. Run SCTransform, a newer and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling).
What do we need normalisation for?

The number of raw sequencing reads per cell is influenced by technical differences in the capture, reverse transcription and sequencing of RNA molecules, particularly due to the difficulty of achieving consistent library preparation with minimal starting material. Thus, comparing gene expression between cells may reveal differences that are solely due to sampling effects. After low-quality cells were removed by filtering, the primary goal of normalization is to remove technical sampling effects while preserving the true biological signal. Hence, normalization of the raw counts accounts for differences in sequencing depth per cell.

The standard log normalisation, a count depth scaling, is the simplest and most commonly used normalization strategy. The underlying idea is that each cell initially contained an equal number of mRNA molecules, and differences arise due to sampling effects. For each cell, the number of reads per gene is divided by a cell-specific “size factor”, which is proportional to the total count depth of the cell. The resulting normalized data add up to 1 per cell, and are then typically multiplied by a scale factor (10,000 in this workflow). Finally, normalized data are log-transformed for three important reasons. First, distances between log-transformed expression data represent log fold changes. Log-transformation emphasizes contributions from genes with strong relative differences, for example a gene that is expressed at an average count of 50 in cell type A and 10 in cell type B rather than a gene that is expressed at an average count of 1100 in A and 1000 in B. Second, log-transformation mitigates the relation between mean and variance in the data. Lastly, log-transformation reduces the skewness of the data, as many downstream analyses require the data to be normally distributed.
What do we need scaling for? To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score).
After normalization, gene expression data can be compared between cells. However, expression of individual genes still cannot be compared. Highly expressed genes can overpower the signal of other less expressed genes with equal importance (overdispersed count values). Moreover, in SmartSeq-2 data, due to differing gene lengths, longer genes can be represented by a higher number of reads. To account for this effect, normalized data are further scaled using a z-transformation, resulting in an average expression of 0 and a variance of 1 for each gene across all cells. Note that additional unwanted sources of variation can be regressed out during the scaling process, such as cell cycle effects or the percentage of mitochondrial reads.
What makes SCTransform special? The standard log normalisation assumes that count depth influences all genes equally. However, it has been shown that the use of a single size factor will introduce different effects on highly and lowly expressed genes (r Cite("10.1186/s13059-019-1874-1")). SCTransform is a new statistical approach for the modelling, normalization and variance stabilization of single-cell RNA-seq data. SCTransform models the UMI counts (via a regularized negative binomial model) to remove the variation due to sequencing depth, adjusts the variance by pooling information across genes with similar abundances, and automatically regresses out sequencing depth (nCounts). Hence, SCTransform can be applied to 10x data (which contain UMI counts) but not to SmartSeq-2 data. Additional unwanted sources of variation can be regressed out during SCTransform.

While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.

# Normalize data the original way
#   This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
# Only run the normalization module when the "data" layer is not yet present in
# the RNA assay (e.g. when the object was loaded from a raw rds).
# NOTE(review): assumes '[]' on the assay lists its layer names — confirm for
# the Seurat version in use.
if (!("data" %in% sc[[param$project_id]][["RNA"]][])) {
  source(file.path(param$path_to_git,'modules/pre-processing/normalization.R'))
}
# Report which normalization method was applied; no covariates are regressed out here
message(paste0("Here, the ", ifelse(param$norm=="RNA","Standard log normalization","SCTransform")," method was used and no additional sources of variance regressed out."))

× (Message)
Here, the Standard log normalization method was used and no additional sources of variance regressed out.


Variable genes

We select and plot the top 3,000 variable genes. If the composition of the samples is overall similar, it would be expected that they roughly share the same variable genes.
Why selecting variable genes? For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly in others.
Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single-cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells. Highly variable genes are typically the genes with a cell type specific expression profile, and are often the genes of interest in single-cell experiments. Housekeeping genes, with similar levels of expression across all cells, or genes with minor expression differences, might add random noise and mask relevant changes during downstream dimensionality reduction and clustering. We therefore aim to select a sensible set of variable genes that includes interesting biological signal and excludes noise. Here, the top 3,000 variable genes are selected and used for downstream analysis.
How are variable genes selected? To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in.
standard log normalisation:
Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (r Cite("10.1186/s13059-019-1874-1")). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability.
SCTransform:
Also SCTransform returns a list of top 3,000 variable genes.
# Figure height: 5 units per row of a two-column panel grid
fig_height_vf = 5 * ceiling(length(names(sc))/2)
# If VariableFeatures not yet present in object
# Find variable features from normalized data (unaffected by scaling)
# Scalar '&&' is the correct operator inside if(); 'param$norm != "SCT"'
# replaces the precedence-fragile '!param$norm=="SCT"' form
if (!("scale.data" %in% sc[[param$project_id]][["RNA"]][]) && param$norm != "SCT") {
  sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
}

# Plot VariableFeaturePlot: one panel per sample, with its top-10 variable genes labelled
p_list = purrr::map(list_names(sc), function(sample_name) {
  # SCT stores variable features in its own assay; otherwise use the raw assay
  plot_assay = ifelse(param$norm=="SCT", param$norm, param$assay_raw)
  top10 = head(Seurat::VariableFeatures(sc[[sample_name]], assay=plot_assay), 10)
  panel = Seurat::VariableFeaturePlot(sc[[sample_name]],
                                      assay=plot_assay,
                                      selection.method=ifelse(param$norm=="RNA", "vst", "sct"),
                                      col=c("grey", param$col), pt.size = param$pt_size) +
    AddStyle(title=sample_name) +
    theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
  LabelPoints(plot=panel, points=top10, repel=TRUE, xnudge=0, ynudge=0)
})

p = patchwork::wrap_plots(p_list, ncol=2)
p = p + patchwork::plot_annotation("Variable genes")
p

If multiple data sets are processed, we merge the datasets now.

# With multiple samples, delegate merging/integration to the combine module;
# a single sample is simply unwrapped from its list
if (length(sc) != 1) {
  source(file.path(param$path_to_git,'modules/pre-processing/combine_samples.R'))
} else {
  # Default assay is set automatically
  sc = sc[[1]]
  message("Your dataset contains 1 sample only. No merging/integrating required.")
}

× (Message)
Data values for all samples have been merged. This means that data values have been concatenated, not integrated.

## An object of class Seurat 
## 8000 features across 1000 samples within 1 assay 
## Active assay: RNA (8000 features, 3000 variable features)
##  3 layers present: data, counts, scale.data
##  1 dimensional reduction calculated: pca
# Only relevant if data loaded from rds, otherwise they would be scaled and
# dimensional reduced before
needs_scaling = !("scale.data" %in% sc[["RNA"]][])
if (needs_scaling) {
  # Scale (default): all genes, not only the variable ones
  all_genes = rownames(sc)
  sc = suppressMessages(ScaleData(sc, features = all_genes))
}

# Run pca with the variable genes
needs_pca = !("pca" %in% list_names(sc@reductions[]))
if (needs_pca) {
  # Run PCA for default normalization; cap the number of PCs at the cell count
  sc = suppressMessages(Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc))))
}


Relative log expression

To better understand the efficiency of the applied normalization procedures, we plot the relative log expression of genes in 100 randomly selected cells per sample before and after normalization. This type of plot reveals unwanted variation in your data.
The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes by calculating for each gene its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#00468BFF, #ED0000FF).
If expression levels of most genes are supposed to be similar in all cell types, sample heterogeneity is a sign of unwanted variation.

# Number of cells to draw per sample for the RLE plots
n_cells_rle_plot = 100

# Sample at most 100 cells per dataset and save their identity
# dplyr::n() is the idiomatic group size inside grouped verbs; ungroup() so the
# result does not carry grouping metadata downstream
cells_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>% 
  dplyr::group_by(orig.ident) %>% 
  dplyr::sample_n(size=min(n_cells_rle_plot, dplyr::n())) %>% 
  dplyr::ungroup() %>% 
  dplyr::select(rowname, orig.ident)

# Relative log expression of the raw data (log2 of counts + 1)
raw_mat = as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$assay_raw, layer = "counts") + 1))
p1 = PlotRLE(raw_mat,
             id=cells_subset$orig.ident,
             col=param$col_samples) +
  labs(title="log2(raw counts + 1)")

# Relative log expression after normalization
norm_mat = as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$norm, layer = "data"))
p2 = PlotRLE(norm_mat,
             id=cells_subset$orig.ident,
             col=param$col_samples) +
  labs(title="Normalised data")

# Stack raw above normalized for direct comparison
p = p1 / p2
p




Investigating covariants

Next, we estimate the impact of covariates, such as the percentage of mitochondrial gene expression and cell cycle phase, to evaluate the reliability of the applied normalization and scaling method, and to determine possible variables to regress out as well as the integration method.

What are unwanted sources of variations that could be regressed out?

There can be sources of uninteresting variation in the data that are often specific to the dataset. Hence, checking for and removing unwanted variation is another important pre-processing step, so that those artifacts do not drive clustering in the downstream analysis.

Variables provided in vars.to.regress are individually regressed against each feature, and the resulting residuals are then scaled and centered. This step allows controlling for factors that may bias clustering.

Count depth effect
Although normalization scales count data to render gene counts comparable between cells, a count depth effect often remains in the data as no scaling method can infer the expression values of genes that were not detected. This count depth effect can be both a biological and a technical artefact (due to poor sampling).
SCTransform regresses out variation due to the number of UMIs by default.

Cell cycle score
Cell cycle phase may be a source of significant variation in the data. It is essential to check, how much the gene expression profiles in the data reflect the cell cycle phases the single cells were in.

Mitochondrial gene expression
mitochondrial gene expression is another factor which can greatly influence clustering. However, if the differences in mitochondrial gene expression represent a biological phenomenon that may help to distinguish cell clusters, then we advise not regressing the mitochondrial expression.


Dimensional reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes, since expression profiles of different genes are correlated if they are involved in the same biological process. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarize a dataset, and to visualize a dataset.

We use Principal Component Analysis (PCA) to summarize a dataset, overcoming noise and reducing the data to its essential components. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualize the dataset, placing similar cells together in 2D space, see below.

PCA in a nutshell

Principal Component Analysis is a way to summarize a dataset and to reduce noise by averaging similar gene expression profiles. The information for correlated genes is compressed into single dimensions called principal components (PCs) and the analysis identifies those dimensions that capture the largest amount of variation. This process gives a more precise representation of the patterns inherent to complex and large datasets.

In a PCA, the first PC captures the greatest variance across the whole dataset. The next PC captures the greatest remaining amount of variance, and so on. This way, the top PCs are likely to represent the biological signal where multiple genes are affected by the same biological processes in a coordinated way. In contrast, random technical or biological noise that affects each gene independently are contained in later PCs. Downstream analyses can be restricted to the top PCs to focus on the most relevant biological signal and to reduce noise and unnecessary computational overhead.


Principal component analysis

Running a PCA on our object, we see how the variance can be explained.
In case an early PC dimension is split on cell-cycle genes or mitochondrial genes, we might want to regress this signal from the data, so that cell-cycle heterogeneity or expression of mitochondrial genes does not contribute to PCA or downstream analysis. However, remember that those signals could also be informative of the biology!

# Per-PC loading panels for the first 12 PCs
p_list = Seurat::VizDimLoadings(sc, dims=1:12, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
# seq_along() is safe for an empty list, where seq() would yield c(1, 0)
for (i in seq_along(p_list)) p_list[[i]] = p_list[[i]] + AddStyle(xlab = paste0("PC ",i))
# Title corrected: the panels show the first 12 PCs, not two
p = patchwork::wrap_plots(p_list, ncol=4) + patchwork::plot_annotation("Top gene loadings of the first 12 PCs")
p


Feature plots

The feature plots allow us to examine the distribution of cells with respect to specific features, here QC covariates (number of counts and features as well as percent of mitochondrial and ribosomal reads), and, hence, to infer whether those features might exert some influence on the localisation and distribution of cells in low-dimensional space. These plots can clarify the necessity and scope of applying covariate filters.

# Generate UMAP (only when not already present, e.g. after loading from rds)
umap_missing = !("umap" %in% list_names(sc@reductions[]))
if (umap_missing) {
  # Default UMAP
  sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k))
}

# Number of counts, with a log-scaled colour gradient
qc_feature = paste0("nCount_", param$assay_raw)
p1 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature, pt.size = param$pt_size) +
  AddStyle(title="Number of counts", legend_position="right", xlab = "UMAP 1", ylab = "UMAP 2") +
  scale_colour_gradient(low=param$col_bg, high=param$col, trans="log10"))

# Number of features, with a log-scaled colour gradient
qc_feature = paste0("nFeature_", param$assay_raw)
p2 = suppressMessages(Seurat::FeaturePlot(sc, features=qc_feature, pt.size = param$pt_size) +
  AddStyle(title="Number of features", legend_position="right", xlab = "", ylab = "") +
  scale_colour_gradient(low=param$col_bg, high=param$col, trans="log10"))

# Percent mitochondrial reads
p3 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c(param$col_bg, param$col), pt.size = param$pt_size) +
  AddStyle(title="% mitochondrial", legend_position="right", xlab = "", ylab = "")

# Percent ribosomal reads
p4 = Seurat::FeaturePlot(sc, features="percent_ribo", cols=c(param$col_bg, param$col), pt.size = param$pt_size) +
  AddStyle(title="% ribosomal", legend_position="right", xlab = "", ylab = "")

# All four QC panels in a single row
p = p1 | p2 | p3 | p4
p


Cell Cycle Effect

The following plots can help to evaluate whether the cell cycle phases are a major source of variation in the dataset and the necessity to regress out cell cycle effects during scaling.
An indication to consider removal of cell cycle effects would be strong differences between samples (that are supposed to have a similar cell composition) or if cells in the same cell cycle phase form very distinctive clusters. Nevertheless, the final decision also depends on the biological system and scientific question.

How does removal of cell cycle effects affect the data?

Note that removing all signal associated to cell cycle can negatively impact downstream analysis. Cell cycle signals can be informative of the biology. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. Moreover, biological signals must be understood in context. Dependencies exist between different cellular processes within the organism. Thus, correcting for one process may unintentionally mask the signal of another. Vice versa, due to a correlation of cell size and some transcriptomic effect attributed to the cell cycle (McDavid et al, 2016), correcting for cell size via normalization, also partially corrects for cell cycle effects in scRNA‐seq data.

An alternative approach is to remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences in cell cycle phase amongst proliferating cells (which are often uninteresting), will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat r Cite("https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html").
How are cycle effects regressed out? Prior to calling the function CellCycleScoring the data need to be standard log normalized, since counts need to be comparable between cells. Once the data is normalized for sequencing depth, a score can be calculated for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in r Cite("10.1126/science.aad0501"), and human gene symbols are translated to gene symbols of the species of interest using biomaRt.
If specified, cell cycle effects can be removed during scaling. In the process, for each gene, Seurat models the relationship between gene expression and the S and G2M cell cycle scores. The scaled residuals of this model represent a ‘corrected’ expression matrix.
# Create one colour per cell cycle phase and attach the palette to the sc object
phase_colours = GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=phase_colours), lists_slot="colour_lists")

# UMAP coloured by cell cycle phase, all phases overlaid in one panel
p1 = Seurat::DimPlot(sc, reduction="umap", group.by="Phase", pt.size=param$pt_size) +
  AddStyle(title="", xlab="UMAP 1", ylab="UMAP 2") +
  NoLegend()

# UMAP split into one panel per cell cycle phase (stacked in a single column)
p2 = Seurat::DimPlot(sc, reduction="umap", group.by="Phase", split.by="Phase", pt.size=param$pt_size, ncol=1) +
  AddStyle(title="", xlab="UMAP 1", ylab="UMAP 2") +
  NoLegend()

× (Warning) Removing 19 cells missing data for vars requested

# Fraction of cells per cell cycle phase, stacked per sample
# (dplyr::n() is the idiomatic per-group counter; ungroup so no grouping
#  attributes leak into the plotted data)
p3 = ggplot(sc[[]] %>% 
              dplyr::group_by(orig.ident, Phase) %>% 
              dplyr::summarise(num_cells=dplyr::n()) %>% 
              dplyr::ungroup(), 
            aes(x=orig.ident, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_y_continuous("Fraction of cells") +
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1, size = 12)) + 
  xlab("")

# Combine: overlaid UMAP, per-phase panels, a spacer, and the phase fractions
p = p1 + p2 + plot_spacer() + p3 + 
  plot_layout(widths = c(3, 1, 1, 1)) + 
  patchwork::plot_annotation(title="Cell cycle phases") 
p


Batch correction

Integration can help to remove biological differences and match shared cell types and states across datasets.
Whether samples should be combined via merging or integrating is very context dependent. It is advisable to first perform a dimensionality reduction on the merged dataset and only proceed to integration if common cell types separate due to batch effects.

Why integration? If the cells cluster by covariants, such as donors, batches, or condition, integrative analysis can help to remove biological differences, match shared cell types and states across datasets, and improve clustering and downstream analyses.
The goal of integration is to find corresponding cell states across samples. Vice versa, integration relies on the presence of at least a subset of corresponding cell states that are shared across datasets. However, given the increased degrees of freedom of non‐linear data integration approaches, this approach might lead to over‐correction of data, especially when a large proportion of cells are non-overlapping across datasets, and loss of biological resolution.
How does integration work? Integration uses the shared highly variable genes from each condition and then aligns the conditions to overlay cells that are similar between samples. The integrated values are a non-linear transformation of scale.data and saved in an additional layer. Moreover, the Seurat v5 integration procedure returns a dimensional reduction (i.e. integrated.cca) of the data that captures the shared sources of variance across multiple layers and can be used for visualization and unsupervised clustering. The harmonization of LogNormalized and SCT expression values across datasets is overall similar (see https://satijalab.org/seurat/articles/seurat5_integration and https://satijalab.org/seurat/articles/integration_introduction.html).
The workflow in detail The workflows for the different normalization and sample combination methods are as follows:
  1. LogNormalize data
    1a) LogNormalize - (CC scoring) - Merge - (Rerun CC scoring) - Scale - PCA
    1b) LogNormalize - (CC scoring) - Scale - PCA - Integrate - (Rerun CC scoring) - Scale - PCA
  2. SCTransform data
    with homogeneous experimental groups
    2a) (LogNormalize - CC scoring - ) SCTransform - Merge - (Rerun CC scoring) - PCA
    2b) (LogNormalize - CC scoring - ) SCTransform - PCA - Integrate - (Rerun CC scoring) - PCA
    with heterogeneous experimental groups
    2c) (LogNormalize - CC scoring - ) SCTransform - Merge - (Rerun CC scoring) - Rerun SCTransform - PCA
    2d) (LogNormalize - CC scoring - ) SCTransform - PCA - Integrate - (Rerun CC scoring) - Rerun SCTransform - PCA

Merging LogNormalized data (1a)
LogNormalized data are merged according to https://satijalab.org/seurat/archive/v4.3/merge. Merging will combine the Seurat objects based on the raw count matrices and also merge the normalized data matrices. Any previously scaled data matrices are erased. As the results of this layer depend on the expression across all cells in an object, the previous values from one object before merging with another are no longer valid. Therefore, the layer is removed and the scaling needs to be re-run after merging objects.

Merging SCTransform data (2a, 2c)
The goal of SCTransform is to perform normalization within an experiment by learning the model of technical noise (within the experiment). Running SCTransform standardizes the expression of each gene in each cell relative to all the other cells in the input matrix.
2a) If you have multiple samples that are technically noisy and you want to remove batch effects that are characterized by simple shifts in mean expression, it is recommend to run SCTransform on each sample individually. However, the samples should have roughly the same celltype compositions. The merged object has then multiple SCT models and the genes in the scale.data are the intersected genes.
2c) However, scale.data calculated based on multiple different SCTransform models are not comparable. Performing SCTransform separately on very diverging samples results in loss of the baseline differences between them (similar to a gene-wise scaling before merging). Hence, if samples are biologically heterogeneous or under different treatments and, therefore, the SCTransform models of each separate sample very diverging, SCTranform should be (re-)run on merged samples. This way, SCTransform will learn one model using the median sequencing depth of the entire dataset to calculate the corrected counts.

Integration (1b, 2b, 2d)
As integration uses the shared highly variable genes, prior to integration LogNormalize and identification of variable genes, or SCTransform, is required. Depending on the integration method, a dimensional reduction of the data is needed additionally (e.g. for reciprocal PCA).

In most cases, a re-scoring of the cell cycle state is recommended or at least not disadvantageous.


Methods for sample combination

This workflow implements the following integration methods offered by Seurat:
- Anchor-based CCA integration (method=CCAIntegration)
- Anchor-based RPCA integration (method=RPCAIntegration)

Anchor-based integration can be applied to either log-normalized or SCTransform-normalized datasets and can be also performed as reference-based integration where one or a subset of the datasets are listed as a ‘reference’. Reference-based integration, reduces computational time.

Canonical Correlation Analysis (CCA)

CCA-based integration enables identification of conserved cell types across datasets and integrative analysis when gene expression is shifted e.g. due to experimental conditions, disease states, or even species affiliation. If cell types are present in only one dataset, the cells will appear as a separate sample-specific cluster.

Integration comprises the following steps:
1. CCA, a form of PCA, identifies shared sources of variation (using the 3000 most variant genes from each sample) between the samples.
2. For each cell in one sample, mutual nearest neighbors (MNNs) based on gene expression values are identified in the other sample(s). If the same match results from reciprocal analysis the cell pair forms an anchor.
3. The similarity between anchor pairs, i.e. whether the adjacent cells also form corresponding anchor pairs, is assessed and incorrect anchors are removed.
4. The datasets are integrated by transforming the cell expression values using anchors and corresponding scores.
Reciprocal PCA (RPCA) For RPCA, instead of identifying shared sources of variation using variant genes, each dataset is projected into the other’s PCA space. Afterwards, similarly to CCA, mutual neighbors and anchors are identified.
RPCA-based integration is significantly faster and more conservative, i.e. cells in different biological states are less likely to align. Hence, RPCA is suitable for integration of multiple datasets, of datasets with little overlap of cell types, or datasets originating from the same platform. The strength of alignment can be elevated by increasing the k.anchor parameter (default 5).


To explore the results of data merging and different integration methods, here integration is performed in low-dimensional space (streamlined (one-line) integrative analysis as described in https://satijalab.org/seurat/articles/seurat5_integration).

How does streamlined (one-line) integrative analysis work? Seurat v5 enables streamlined integrative analysis via the IntegrateLayers function.
First, the samples are merged and then the layers are split (a counts and data layer for each batch). As the data is split into layers, normalization and variable feature identification is performed for each set independently and a consensus set of variable features is identified. Moreover, a dimensional reduction that (hopefully) co-embeds shared cell types across samples is returned as output.

The following plots offer a low dimension representation of your data combined via the different methods.

### Integrate 
# On merged data sets 
# sc = merge(x=sc[[1]], y=sc[2:length(sc)], ...)
# sc = JoinLayers(sc)
# sc = RunPCA(sc, ...)

# Split RNA layer(s): one counts/data layer per sample, required for IntegrateLayers
sc[["RNA"]] <- split(sc[["RNA"]], f = sc$orig.ident)

# As in https://satijalab.org/seurat/articles/integration_introduction.html
# and https://satijalab.org/seurat/articles/seurat5_integration
# Integration methods and names of the reductions they produce
integration_function = c("CCAIntegration", "RPCAIntegration")
reduction_name = c("integrated.cca", "integrated.rpca")

# The RNA and SCT workflows only differ in the normalization method passed to
# IntegrateLayers (orig.reduction defaults to "pca", so passing it explicitly
# is equivalent for both), so a single consolidated loop is used.
if (param$norm %in% c("RNA", "SCT")) {
  normalization_method = if (param$norm == "SCT") "SCT" else "LogNormalize"
  for (i in seq_along(integration_function)) {
    sc = IntegrateLayers(object = sc, 
                         method = integration_function[i],
                         orig.reduction = "pca", 
                         normalization.method = normalization_method,
                         features = VariableFeatures(sc),
                         new.reduction = reduction_name[i],
                         verbose = FALSE)
    # UMAP on the integrated reduction, e.g. "integrated.cca" -> "umap.cca"
    sc = RunUMAP(sc, reduction = reduction_name[i], dims = 1:30, reduction.name = gsub("integrated", "umap", reduction_name[i]))
  }
}

# Re-join RNA layer(s) into single counts/data layers after integration
sc[["RNA"]] = JoinLayers(sc[["RNA"]])
### Score plots
# Helper: score plot of two dimensions of a reduction, coloured by sample
PlotDimScores = function(object, reduction, dims) {
  Seurat::DimPlot(object, reduction=reduction, group.by="orig.ident", cols=param$col_samples, pt.size=1, dims=dims) + 
    AddStyle(legend_position="none", title="", xlab=paste("PC", dims[1]), ylab=paste("PC", dims[2]))
}

# Helper: UMAP of a reduction, coloured by sample
PlotUmapBySample = function(object, reduction) {
  Seurat::DimPlot(object, reduction=reduction, group.by="orig.ident", pt.size=param$pt_size, cols=param$col_samples) +
    AddStyle(title="", legend_position="right", xlab="UMAP 1", ylab="UMAP 2")
}

# PCA score plots of the merged data (PC1/2 and PC3/4), coloured by sample
p1 = PlotDimScores(sc, "pca", c(1, 2))
p2 = PlotDimScores(sc, "pca", c(3, 4))

# CCA integrated score plots
p3 = PlotDimScores(sc, "integrated.cca", c(1, 2))
p4 = PlotDimScores(sc, "integrated.cca", c(3, 4))

# RPCA integrated score plots
p5 = PlotDimScores(sc, "integrated.rpca", c(1, 2))
p6 = PlotDimScores(sc, "integrated.rpca", c(3, 4))

### Umaps
# UMAPs coloured by sample: merged, CCA integrated, RPCA integrated
p7 = PlotUmapBySample(sc, "umap")
p8 = PlotUmapBySample(sc, "umap.cca")
p9 = PlotUmapBySample(sc, "umap.rpca")

# Combine plots: two stacked score plots next to the corresponding UMAP
score_plots = list(p1 / p2, p3 / p4, p5 / p6)
umap_plots = list(p7, p8, p9)
panel_titles = c("Merged", "CCA integrated", "RPCA integrated")
p_list = list()
for (i in seq_along(panel_titles)) {
  p_list[[i]] = (score_plots[[i]] | umap_plots[[i]]) +
    plot_layout(widths = c(1, 3)) + 
    plot_annotation(title = panel_titles[i]) 
}

Merged

p_list[[1]]




CCA integrated

p_list[[2]]




RPCA integrated

p_list[[3]]




Determining dimensionality of the dataset

Now we need to decide how many PCs we want to use for our analyses.

PCs include biological signal as well as noise, and we need to determine the number of PCs with which we include as much biological signal as possible and as little noise as possible. The following “Elbow plot” is designed to help us make an informed decision.

How do we determine the number of PCs for downstream analysis? The Elbow plot shows PCs ranked based on the percentage of variance they explain. The top PC captures the greatest variance across cells and each subsequent PC represents decreasing levels of variance. By visual inspection of the Elbow plot, we try to find the point at which we can explain most of the variance across cells. Commonly, the top 10 PCs are chosen. It may be helpful to repeat downstream analyses with a different number of PCs, although the results often do not differ dramatically. Note that it is recommended to rather choose too many than too few PCs.
# More approximate technique used to reduce computation time
# Cannot have more PCs than the number of cells; clamp BEFORE plotting so the
# dashed cutoff line is drawn at the value actually used downstream
param$pc_n = min(param$pc_n, ncol(sc))

# Elbow plot: PCs ranked by explained variance. Show at least 20 dims but
# always enough to include the chosen cutoff (param$pc_n may exceed 20).
p = Seurat::ElbowPlot(sc, ndims=min(max(20, param$pc_n), ncol(sc))) + 
  geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) + 
  AddStyle(title="Elbow plot") 
p





Parameters and software versions

The following parameters were used to run the workflow.

Parameters
# Collect the workflow parameters and render them as a styled HTML table
out = ScrnaseqParamsInfo(params=param)

params_table = knitr::kable(out, align="l")
kableExtra::kable_styling(params_table, bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
path_to_git /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis
scriptname modules/pre-processing/qc.Rmd
author kosankem
assay_raw RNA
downsample_cells_equally FALSE
cell_filter sample1:nFeature_RNA=c(NA, NA), nCount_RNA=c(NA, NA), percent_mt=c(NA, NA); sample2:nFeature_RNA=c(NA, NA), nCount_RNA=c(NA, NA), percent_mt=c(NA, NA)
feature_filter sample1:min_counts=1, min_cells=5; sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
experimental_groups homogene
integrate_samples method:merge
pc_n 30
cluster_k 20
umap_k 30
cluster_resolution_test 0.3, 0.7
cluster_resolution 0.5
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
col_palette_samples ggsci::pal_lancet
col_palette_clusters ggsci::pal_igv
col_palette_annotation ggsci::pal_igv
col #0086b3
col_bg #D3D3D3
pt_size 0.5
project_id Testdata
path_data name:sample1, sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/
path_out /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/qc
species human
debugging_mode default_debugging
mart_dataset hsapiens_gene_ensembl
mt ^MT-
path_reference /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98
reference hsapiens_gene_ensembl.v98.annot.txt
file_annot /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx
col_samples sample1=#00468BFF, sample2=#ED0000FF
downsample_cells_n 500

This report was generated using the modules/pre-processing/qc.Rmd script. Software versions were collected at run time.

Software versions
# Collect software/package versions recorded at run time and render as a table
out = ScrnaseqSessionInfo(param$path_to_git)

versions_table = knitr::kable(out, align="l")
kableExtra::kable_styling(versions_table, bootstrap_options=c("striped", "hover"))
Name Value
Run on: Fri May 17 05:43:37 PM 2024
ktrns/scrnaseq 9f5e35d534afb1f722e5ece13cedc8bbbb84d3b6
Container NA
R R version 4.2.1 (2022-06-23)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.6 LTS
Host name hpc-rc11
Host OS #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic)
Packages abind1.4-5, backports1.4.1, beeswarm0.4.0, bibtex0.5.1, bslib0.6.1, cachem1.0.8, cli3.6.2, cluster2.1.3, codetools0.2-18, colorspace2.1-0, cowplot1.1.3, curl5.2.1, data.table1.15.2, deldir2.0-4, digest0.6.34, dotCall641.1-1, dplyr1.1.4, ellipsis0.3.2, evaluate0.23, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, fitdistrplus1.1-11, future1.33.1, future.apply1.11.1, generics0.1.3, ggbeeswarm0.7.2, ggplot23.5.0, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, globals0.16.2, glue1.7.0, goftest1.2-3, gridExtra2.3, gtable0.3.4, highr0.10, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, irlba2.3.5.1, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, lifecycle1.0.4, listenv0.9.1, lmtest0.9-40, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, Matrix1.6-5, matrixStats1.1.0, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, openxlsx4.2.5.2, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, RANN2.6.1, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RefManageR1.4.0, renv0.16.0, reshape21.4.4, reticulate1.35.0, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, rstudioapi0.15.0, Rtsne0.17, sass0.4.8, scales1.3.0, scattermore1.2, sctransform0.4.1, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shiny1.8.0, sp2.1-3, spam2.10-0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, stringi1.8.3, stringr1.5.1, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, tibble3.2.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridisLite0.4.2, withr3.0.0, xfun0.41, xml21.3.6, xtable1.8-4, 
yaml2.3.8, zip2.3.1, zoo1.8-12


Credits and References

This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).

# Export the knitcitations bibliography to references.bib in the output folder
bib_file = file.path(param$path_out, "references.bib")
knitcitations::write.bibtex(file=bib_file)
Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hao, Yuhan, Stephanie Hao, Erica Andersen-Nissen, William M. Mauck III, Shiwei Zheng, Andrew Butler, Maddie J. Lee, et al. 2021. “Integrated Analysis of Multimodal Single-Cell Data.” Cell. https://doi.org/10.1016/j.cell.2021.04.048.
Hao, Yuhan, Tim Stuart, Madeline H Kowalski, Saket Choudhary, Paul Hoffman, Austin Hartman, Avi Srivastava, et al. 2023. “Dictionary Learning for Integrative, Multimodal and Scalable Single-Cell Analysis.” Nature Biotechnology. https://doi.org/10.1038/s41587-023-01767-y.